{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "\n",
    "import re\n",
    "\n",
    "from pandas import read_csv\n",
    "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer\n",
    "from sklearn import cross_validation\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.metrics import precision_score\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "from skmultilearn.problem_transform.br import BinaryRelevance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# the original file has been divided into 100 pieces\n",
    "# aa is but one piece\n",
    "df = read_csv(\"../data/pieces/aa\",\n",
    "    names=['id','title','body','tags'],\n",
    "    header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>title</th>\n",
       "      <th>body</th>\n",
       "      <th>tags</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>How to check if an uploaded file is an image w...</td>\n",
       "      <td>&lt;p&gt;I'd like to check if an uploaded file is an...</td>\n",
       "      <td>php image-processing file-upload upload mime-t...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\"2\"</td>\n",
       "      <td>How can I prevent firefox from closing when I ...</td>\n",
       "      <td>&lt;p&gt;In my favorite editor (vim), I regularly us...</td>\n",
       "      <td>firefox</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\"3\"</td>\n",
       "      <td>R Error Invalid type (list) for variable</td>\n",
       "      <td>&lt;p&gt;I am import matlab file and construct a dat...</td>\n",
       "      <td>r matlab machine-learning</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>\"4\"</td>\n",
       "      <td>How do I replace special characters in a URL?</td>\n",
       "      <td>&lt;p&gt;This is probably very simple, but I simply ...</td>\n",
       "      <td>c# url encoding</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>\"5\"</td>\n",
       "      <td>How to modify whois contact details?</td>\n",
       "      <td>&lt;pre&gt;&lt;code&gt;function modify(.......) {   $mcont...</td>\n",
       "      <td>php api file-get-contents</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     id                                              title  \\\n",
       "0     1  How to check if an uploaded file is an image w...   \n",
       "1   \"2\"  How can I prevent firefox from closing when I ...   \n",
       "2   \"3\"           R Error Invalid type (list) for variable   \n",
       "3   \"4\"      How do I replace special characters in a URL?   \n",
       "4   \"5\"               How to modify whois contact details?   \n",
       "\n",
       "                                                body  \\\n",
       "0  <p>I'd like to check if an uploaded file is an...   \n",
       "1  <p>In my favorite editor (vim), I regularly us...   \n",
       "2  <p>I am import matlab file and construct a dat...   \n",
       "3  <p>This is probably very simple, but I simply ...   \n",
       "4  <pre><code>function modify(.......) {   $mcont...   \n",
       "\n",
       "                                                tags  \n",
       "0  php image-processing file-upload upload mime-t...  \n",
       "1                                            firefox  \n",
       "2                          r matlab machine-learning  \n",
       "3                                    c# url encoding  \n",
       "4                          php api file-get-contents  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>title</th>\n",
       "      <th>body</th>\n",
       "      <th>tags</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>How to check if an uploaded file is an image w...</td>\n",
       "      <td>&lt;p&gt;I'd like to check if an uploaded file is an...</td>\n",
       "      <td>php image-processing file-upload upload mime-t...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>How can I prevent firefox from closing when I ...</td>\n",
       "      <td>&lt;p&gt;In my favorite editor (vim), I regularly us...</td>\n",
       "      <td>firefox</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>R Error Invalid type (list) for variable</td>\n",
       "      <td>&lt;p&gt;I am import matlab file and construct a dat...</td>\n",
       "      <td>r matlab machine-learning</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>How do I replace special characters in a URL?</td>\n",
       "      <td>&lt;p&gt;This is probably very simple, but I simply ...</td>\n",
       "      <td>c# url encoding</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>How to modify whois contact details?</td>\n",
       "      <td>&lt;pre&gt;&lt;code&gt;function modify(.......) {   $mcont...</td>\n",
       "      <td>php api file-get-contents</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  id                                              title  \\\n",
       "0  1  How to check if an uploaded file is an image w...   \n",
       "1  2  How can I prevent firefox from closing when I ...   \n",
       "2  3           R Error Invalid type (list) for variable   \n",
       "3  4      How do I replace special characters in a URL?   \n",
       "4  5               How to modify whois contact details?   \n",
       "\n",
       "                                                body  \\\n",
       "0  <p>I'd like to check if an uploaded file is an...   \n",
       "1  <p>In my favorite editor (vim), I regularly us...   \n",
       "2  <p>I am import matlab file and construct a dat...   \n",
       "3  <p>This is probably very simple, but I simply ...   \n",
       "4  <pre><code>function modify(.......) {   $mcont...   \n",
       "\n",
       "                                                tags  \n",
       "0  php image-processing file-upload upload mime-t...  \n",
       "1                                            firefox  \n",
       "2                          r matlab machine-learning  \n",
       "3                                    c# url encoding  \n",
       "4                          php api file-get-contents  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"id\"]=df[\"id\"].apply(lambda str: str.strip().lstrip('\"').rstrip('\"'))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pat = re.compile('<[^>]+>')\n",
    "def preprocessor(s):\n",
    "    return pat.sub(' ',s).lower()\n",
    "\n",
    "def tokenizer(s):\n",
    "    return s.split()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# i've doubled weight for the title as it should be more important\n",
    "text_data = df[\"title\"] + ' ' + df[\"title\"] + ' ' + df[\"body\"]\n",
    "vectorizerX = TfidfVectorizer(preprocessor=preprocessor, max_features=1000)\n",
    "# print(type(title_data))\n",
    "X = vectorizerX.fit_transform(text_data.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# now extract features for labels\n",
    "# regular features, no tfidf this time\n",
    "\n",
    "# maybe what I want is a CountVectorizer?\n",
    "tag_data = df[\"tags\"]\n",
    "token_pattern = '(?u)\\b\\w+\\b' # allow 1-letter tokens\n",
    "vectorizerY = CountVectorizer(tokenizer=tokenizer, token_pattern=token_pattern,max_features=1000)\n",
    "Y = vectorizerY.fit_transform(tag_data.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "60308"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "N = text_data.size\n",
    "N"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((12061, 1000), (48247, 1000), (12061, 1000), (48247, 1000))"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X,Y,test_size=0.80, random_state=42)\n",
    "X_train.shape, X_test.shape, Y_train.shape, Y_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/felipe/auto-tagger/venv2/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:756: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring=\"f1_weighted\" instead of scoring=\"f1\".\n",
      "  sample_weight=sample_weight)\n",
      "/home/felipe/auto-tagger/venv2/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "clf = BinaryRelevance(GaussianNB())\n",
    "clf.fit(X_train, Y_train)\n",
    "# cv = cross_validation.ShuffleSplit(n=Y.shape[0],n_iter=3, test_size=0.3,random_state=0)\n",
    "\n",
    "predictions = clf.predict(X_test)\n",
    "score = f1_score(y_test,predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.076051253563532928"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
